# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleSpider.items import JobboleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5
from scrapy.loader import ItemLoader

class JobboleSpider(scrapy.Spider):
    """Spider that walks the blog.jobbole.com post listing and scrapes each article."""

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        Parse one listing page.

        1. Collect the article URLs on the listing page and schedule each one
           for download, to be parsed by ``parse_detail``.
        2. Collect the URL of the next listing page, if present, and schedule
           it to be parsed by this same method.

        :param response: the downloaded listing page
        :return: generator of ``Request`` objects
        """
        # Each node is the thumbnail link of one article in the listing.
        page_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for page_node in page_nodes:
            page_url = page_node.css("::attr(href)").extract_first()
            if not page_url:
                # urljoin() returns the base URL unchanged for an empty or
                # missing href, which would queue the listing page itself
                # to be parsed as an article. Skip such nodes instead.
                continue

            front_image_url = page_node.css("img::attr(src)").extract_first('')
            if front_image_url:
                # Resolve relative / protocol-relative image paths against
                # the page URL so the item carries an absolute URL.
                front_image_url = parse.urljoin(response.url, front_image_url)

            yield Request(
                url=parse.urljoin(response.url, page_url),
                callback=self.parse_detail,
                # Hand the cover image URL over to parse_detail().
                meta={'front_image_url': front_image_url},
            )

        # Follow the "next page" link of the listing, if there is one.
        next_page_url = response.css('.next.page-numbers::attr(href)').extract_first("")
        if next_page_url:
            yield Request(url=parse.urljoin(response.url, next_page_url), callback=self.parse)

    def parse_detail(self, response):
        """
        Parse a single article page into an item.

        Raw field values are gathered with CSS selectors through
        ``ArticleItemLoader``. Any cleaning or type conversion of those
        values is presumably done by the loader / item definitions in
        ``ArticleSpider.items`` -- verify there.

        :param response: the downloaded article page; ``response.meta`` may
            carry ``front_image_url`` as set by ``parse``
        :return: generator yielding the loaded item
        """
        # Cover image URL of the article, passed along from the listing page.
        front_image_url = response.meta.get("front_image_url", "")

        item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
        item_loader.add_css('title', '.entry-header h1::text')
        item_loader.add_css('createDate', '.entry-meta-hide-on-mobile::text')
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('url', response.url)
        # Stored as a one-element list; presumably the consumer of this field
        # expects a list of URLs -- confirm against the pipeline settings.
        item_loader.add_value('front_image_url', [front_image_url])
        item_loader.add_css('praise_nums', "span[class*='vote-post-up'] h10::text")
        item_loader.add_css('commont_nums', "a[href='#article-comment'] > span::text")
        item_loader.add_css('fav_nums', "span[class*='bookmark-btn']::text")
        item_loader.add_css('tags', ".entry-meta-hide-on-mobile a::text")
        item_loader.add_css('content', "div.entry")

        article_item = item_loader.load_item()

        yield article_item

